When \(w_i=\frac{1}{K}\), \(f(\hat{x})=\sum_{i=1}^K\frac{1}{K}y_i=\bar{y}_{\{i\in\{1,2,\dots,K\}\}}\): the estimator takes the average of the \(K\) closest samples (K nearest neighbours). Averaging over neighbours has a smoothing effect that lowers the impact of outliers.
Write an R module with a Gaussian kernel:
# Clear the workspace and fix the RNG seed so the simulations below reproduce.
rm(list=ls())
set.seed(123)
# 'modules' provides module() for building namespaced collections of functions.
library(modules)
#write module KernelRegression (kr)
kr <- module({
import('stats')
# Gaussian kernel: standard-normal density evaluated at the scaled distance d/h.
K <- function(d,h){
dnorm(d / h)
}
# Normalized kernel weights of the sample points xi for the target point x.
# The 1/h factor cancels in the normalization but is kept for the density form.
weights <- function(x,xi,h){
raw <- K(x - xi, h) / h
raw / sum(raw)
}
# Kernel smoother: for each target in x_new, the kernel-weighted average of y
# over the training points x (a linear smoother).
knsmooth <- function(x_new,x,y,h){
W <- sapply(x_new, function(t) weights(x, t, h))
t(W) %*% y
}
})
Test it with simulations:
# Simulate a centered cubic signal with unit-variance Gaussian noise.
x <- runif(100,-3,3)
eps <- rnorm(length(x),0,1)
y <- 2*x^3+3*x^2+4*x+eps
y <- y-mean(y)
#create x_new (an evenly spaced grid) for prediction and plotting
x_new= seq(-3,3,length.out = length(x))
#use the knsmooth function from kr module on simulated x and y to predict at
#x_new for several bandwidths, from undersmoothed (h=0.01) to oversmoothed (h=1)
yks_0 <- kr$knsmooth(x_new,x,y,0.01)
yks_1 <- kr$knsmooth(x_new,x,y,0.1)
yks_2 <- kr$knsmooth(x_new,x,y,0.5)
yks_3 <- kr$knsmooth(x_new,x,y,1)
#plot the fitted curve for each bandwidth over the data scatter
plot(x,y,pch=20,main='KernelSmoother for Cubic x-y with Gaussian Error')
lines(x_new,yks_0,type='l',col='blue',lwd=2)
lines(x_new,yks_1,type='l',col='red',lwd=2)
lines(x_new,yks_2,type='l',col='orange',lwd=4)
lines(x_new,yks_3,type='l',col='green',lwd=2)
legend('topleft',c('h=0.01','h=0.1','h=0.5','h=1'),lty=1,col=c('blue','red','orange','green'))
## Question 3 Cross Validation
#create training and test dataset: cubic signal with Gaussian noise on [0,1]
x <- runif(100,0,1)
eps <- rnorm(length(x),0,1)
y <- 2*x^3+3*x^2+4*x+eps
train <- data.frame(x = x, y = y)
xtest <- runif(100,0,1)
epstest <- rnorm(length(xtest),0,1)
ytest <- 2*xtest^3+3*xtest^2+4*xtest+epstest
# Name the test columns x/y explicitly. The original
# as.data.frame(cbind(xtest,ytest), colnames=c(x,y)) silently ignored the
# colnames argument, leaving columns named xtest/ytest, which downstream
# code only reached via `$` partial matching.
test <- data.frame(x = xtest, y = ytest)
# write a function cv for cross validation
# input: training set (x in column 1, y in column 2), test set (same layout), bandwidth h
# output: the kernel regression prediction for the test set and the error for each test y
cv <- function(train,test,h){
# Address columns by position: the original test frame's columns were named
# xtest/ytest, so test$x/test$y only worked through `$` partial matching;
# positional access behaves identically and is robust to either naming.
yks_test <- kr$knsmooth(test[[1]],train[[1]],train[[2]],h)
err <- test[[2]]-yks_test
return(list(yks_test=yks_test,err=err))
}
# try a bunch of candidate bandwidths h for cross validation
h <- seq(0.001,1,length.out = 100)
# matrix of per-observation test errors, one column per candidate h
err_h = sapply(h,function (t) cv(train,test,t)$err)
# the mean of squared test errors for each h, plotted against h
plot(h,colMeans((err_h)^2),main='Cross Validation to choose h')
h[which.min(colMeans((err_h)^2))]
## [1] 0.08172727
#create noisy x1 and not-so-noisy x2 on (-5,5)
x1 <- runif(500,-5,5)
eps1 <- rnorm(length(x1),0,2)
x2 <- runif(500,-5,5)
eps2 <- rnorm(500,0,0.1)
#create a wiggly function (sine) and a smooth function (quadratic) for x1 and x2
y1w <- sin(x1)+eps1
y1s <- 2*x1^2+eps1
y2w <- sin(x2)+eps2
y2s <- 2*x2^2+eps2
par(mfrow=c(2,2))
plot(x1,y1w,main='noisy wigly')
plot(x2,y2w,main='not noisy wigly')
plot(x1,y1s,main='noisy smooth')
plot(x2,y2s,main='not noisy smooth')
par(mfrow=c(1,1))
# create the training (first 400 rows) and test (last 100 rows) datasets for
# the four scenarios. NOTE(review): the colnames argument is ignored by
# as.data.frame, so the columns keep their original names (e.g. x1, y1w);
# downstream cv() reaches them via `$` partial matching.
xy1w <- as.data.frame(cbind(x1,y1w),colnames=c(x,y))
train1w <- xy1w[1:400,]
test1w <- xy1w[401:500,]
xy1s <- as.data.frame(cbind(x1,y1s),colnames=c(x,y))
train1s <- xy1s[1:400,]
test1s <- xy1s[401:500,]
xy2w <- as.data.frame(cbind(x2,y2w),colnames=c(x,y))
train2w <- xy2w[1:400,]
test2w <- xy2w[401:500,]
xy2s <- as.data.frame(cbind(x2,y2s),colnames=c(x,y))
train2s <- xy2s[1:400,]
test2s <- xy2s[401:500,]
# create a bunch of h we want to choose from
h=seq(0.001,3,length.out = 100)
# choose the best h for x1 with wiggly function by minimizing the cross validation error
err_1w = sapply(h,function (t) cv(train1w,test1w,t)$err)
hbest_1w=h[which.min(colMeans(err_1w^2))]
hbest_1w
## [1] 0.5462727
# choose the best h for x2 with wiggly function by minimizing the cross validation error
err_2w = sapply(h,function (t) cv(train2w,test2w,t)$err)
hbest_2w=h[which.min(colMeans(err_2w^2))]
hbest_2w
## [1] 0.1221717
# choose the best h for x1 with smooth function by minimizing the cross validation error
err_1s = sapply(h,function (t) cv(train1s,test1s,t)$err)
hbest_1s=h[which.min(colMeans(err_1s^2))]
hbest_1s
## [1] 0.2433434
# choose the best h for x2 with smooth function by minimizing the cross validation error
err_2s = sapply(h,function (t) cv(train2s,test2s,t)$err)
hbest_2s=h[which.min(colMeans(err_2s^2))]
hbest_2s
## [1] 0.03129293
#plot each smoother at its best bandwidth. BUG FIX: the titles now show the
#actually selected h; the original hard-coded values (e.g. 'h=1.06') did not
#match the hbest_* computed above.
x_grid=seq(-5,5,length.out = 500)
par(mfrow=c(2,2))
plot(x1,y1w,main=paste0('noisy wiggly, h=',round(hbest_1w,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train1w$x,train1w$y,hbest_1w),col='red',lwd=2)
plot(x2,y2w,main=paste0('not noisy wiggly, h=',round(hbest_2w,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train2w$x,train2w$y,hbest_2w),col='red',lwd=2)
plot(x1,y1s,main=paste0('noisy smooth, h=',round(hbest_1s,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train1s$x,train1s$y,hbest_1s),col='red',lwd=2)
plot(x2,y2s,main=paste0('not noisy smooth, h=',round(hbest_2s,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train2s$x,train2s$y,hbest_2s),col='red',lwd=2)
par(mfrow=c(1,1))
Now note that \[\hat{y}^{-i}=\operatorname{argmin}_{a} \sum_{j\neq i} w_j(y_j-a)^2\] Define \[ z_j=\left\{\begin{array}{cc} y_j & j\neq i \\ \hat{y}^{-i} & j=i \end{array} \right. \] Then \(\hat{y}^{-i}\) also minimizes \[\sum_{j} w_j(z_j-a)^2\] Thus \[\hat{y}^{-i}= \sum_j H_{ij}z_j\] Subtracting \(\hat{y}^{-i}\) from \(\hat{y}_i\) we have \[\hat{y}_i-\hat{y}^{-i}=\sum_j H_{ij}(y_j-z_j)=H_{ii}(y_i-\hat{y}^{-i})\] The last step holds since \(y\) and \(z\) differ only in the term \(j=i\).
Collecting the terms and write the above equation, we have \[\hat{y^{-i}}=\hat{y_i}-H_{ii}y_i+H_{ii}\hat{y^{-i}}\]
i.e., \[(1-H_{ii})\hat{y^{-i}}=\hat{y_i}-H_{ii}y_i\]
Thus, \[\begin{align} LOOCV &=\sum_i (y_i-\hat{y^{-i}})^2 \\ &= \sum_i \left(y_i-\frac{1}{1-H_{ii}}(\hat{y_i}-H_{ii}y_i)\right)^2 \\ &= \sum_i \left(\frac{y_i-\hat{y_i}}{1-H_{ii}}\right)^2 \end{align}\]To implement it,
# write a function loocv
# input x, y and bandwidth h
# output: kernel smoothing fit at each x and the shortcut LOOCV error per point
loocv <- function(x,y,h){
yks=kr$knsmooth(x,x,y,h)
# Smoother matrix: column j holds the weights for target x[j]; its diagonal
# equals that of the hat matrix (diag is invariant under transpose).
H= sapply(x, function(t) kr$weights(x,t,h))
# Vectorized shortcut formula ((y_i - yhat_i)/(1 - H_ii))^2; the original
# recomputed diag(H) inside a per-index sapply loop.
loocverr <- as.vector(((y - yks)/(1 - diag(H)))^2)
return(list(yks=yks,loocverr=loocverr))
}
#use the loocv function on the smooth function of x2 with the h grid created
#before, and choose the h with the smallest mean LOOCV error
loocverr_x2y2s_h=sapply(h,function(t) loocv(x2,y2s,t)$loocverr)
# average the per-point LOOCV errors within each column (one column per h)
loocverr_x2y2s_h=colSums(loocverr_x2y2s_h)/dim(loocverr_x2y2s_h)[1]
hbest_2s_loocv=h[which.min(loocverr_x2y2s_h)]
hbest_2s_loocv
## [1] 0.03129293
\[g_x(x_i,a)=a_0+\sum_{k=1}^D a_k(x_i-x)^k\] where \(a=\{a_0,a_1,\dots,a_D\}\) \[\hat{a}=\operatorname{argmin}_a \sum_{i=1}^n w_i (y_i-g_x(x_i,a))^2\] Define matrix \(R\) and \(W\) \[ R=\left[\begin{array}{ccccc} 1 &(x_1-x)^{1} &(x_1-x)^{2} &... & (x_1-x)^{D} \\ 1 &(x_2-x)^{1} &(x_2-x)^{2} &... & (x_2-x)^{D} \\ ... &... &... &... &... \\ 1 &(x_n-x)^{1} &(x_n-x)^{2} &... & (x_n-x)^{D} \\ \end{array} \right] \]
\[ W=\left[\begin{array}{ccccc} w_1 &0 &0 &... & 0 \\ 0 &w_2 &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & w_n \\ \end{array} \right] \] Then, \[\hat{a}=\operatorname{argmin}_a (y-Ra)^T W (y-Ra)\] Take the derivative w.r.t. \(a\) and set to \(0\) \[\frac{\partial(y-Ra)^T W (y-Ra) }{\partial a}=-2R^TWy+2R^TWRa=0\] gives us \[\hat{a}= (R^T W R)^{-1} R^T W y\]
And \(\hat{f(x)}=g(x,a)=a_0=e_1^T (R^TWR)^{-1}R^TWy\)
When \(p=1\), \[ R=\left[\begin{array}{cc} 1 &(x_1-x) \\ 1 &(x_2-x) \\ ... &... \\ 1 &(x_n-x) \\ \end{array} \right] \] \[ W=\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h}) \\ \end{array} \right] \]
Then we have \[\begin{align} R^T W R &= \left[\begin{array}{ccccc} 1 &1 &1 &... &1 \\ (x_1-x) &(x_2-x) & &(x_3-x)... &(x_n-x) \end{array} \right] \\ & \quad \quad *\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h} \\ \end{array} \right] \\ & \quad \quad *\left[\begin{array}{cc} 1 &(x_1-x) \\ 1 &(x_2-x) \\ ... &... \\ 1 &(x_n-x) \\ \end{array} \right] \\ &= \left[\begin{array}{cc} \sum_i K(\frac{x_i-x}{h}) &\sum_i K(\frac{x_i-x}{h})(x_i-x) \\ \sum_i K(\frac{x_i-x}{h})(x_i-x) &\sum_iK(\frac{x_i-x}{h})(x_i-x)^2 \end{array} \right] \\ &= \left[\begin{array}{cc} \sum_iw_i &s_1 \\ s_1 &s_2 \end{array} \right] \end{align}\] \[\begin{align} R^T W y= &= \left[\begin{array}{ccccc} 1 &1 &1 &... &1 \\ (x_1-x) &(x_2-x) & &(x_3-x)... &(x_n-x) \end{array} \right] \\ & \quad \quad *\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h} \\ \end{array} \right] \\ & \quad \quad *\left[\begin{array}{c} y_1 \\ y_2 \\ ... \\ y_n \\ \end{array} \right] \\ &= \left[\begin{array}{c} \sum_i K(\frac{x_i-x}{h})y_i \\ \sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \end{align}\] Plug into \(\hat{a}\), \[\begin{align} \hat{a} &= \left[\begin{array}{cc} \sum_iw_i &s_1 \\ s_1 &s_2 \end{array} \right] ^{-1} \left[\begin{array}{c} \sum_i K(\frac{x_i-x}{h})y_i \\ \sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \\ &= \left[\begin{array}{c} \frac{s_2}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})y_i-\frac{s_1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \\ -\frac{s_1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})y_i+\frac{1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \end{align}\]\[\hat{f(x)}=a_0=a[1,]=\frac{\sum_i K(\frac{x_i-x}{h})(s_2-s_1(x_i-x))y_i}{(\sum_iw_i)s_2-s_1^2}\]
which means \(\hat{f(x)}=a_0=a[1,]=\frac{\sum_i w_i y_i}{s_2\sum_iw_i-s_1^2}\) with unnormalized weights \(w_i=K(\frac{x_i-x}{h})(s_2-s_1(x_i-x))\)
\[\hat{f(x)}=a_0=e_1^T (R^TWR)^{-1}R^TWy = \frac{\sum_i K(\frac{x_i-x}{h})(s_2-s_1(x_i-x))y_i}{(\sum_iw_i)s_2-s_1^2} \] call the new weights \(l_i\) \[\hat{f(x)}=\sum_il_iy_i=l^Ty\] \[E(\hat{f(x)})=\sum_il_if(x_i)\] \[var(\hat{f(x)})=\sigma^2 ||l||_2^2\] If we assume normality of the residuals \(\epsilon\), \[\hat{f(x)} \sim N(\sum_il_i f(x_i),\sigma^2 ||l||_2^2)\]
so \[ E(\hat{\sigma^2}) =\sigma^2 +\frac{(f(x)-Hf(x))^T(f(x)-Hf(x))}{n-2tr(H)+tr(H^TH)}\]
If the true function is close to the smoothing of the function, the bias is small. Also, if the n is very big, the bias can be neglected.
# Fresh session for the utilities-data analysis.
rm(list=ls())
library(modules)
set.seed(123)
# NOTE(review): machine-specific path; adjust before running elsewhere.
setwd("/Users/qiaohui.lin/Desktop/StatModeling2/")
data=read.csv("utilities.csv",header = TRUE,sep=',')
# Response: average daily gas bill; predictor: temperature.
y=data$gasbill/data$billingdays
x=data$temp
# Design matrix with an intercept column (kept for reference; the local
# linear regression below builds its local design implicitly).
x_reg= cbind(rep(1,length(x)),x)
#local linear regression
llr <- module({
import('stats')
# Gaussian kernel: standard-normal density at the scaled distance d/h.
K <- function(d,h){
dnorm(d / h)
}
# Kernel-weighted second moment of the distances about the target xi.
s2 <- function(x,xi,h){
dist <- xi - x
sum(K(dist, h) * dist^2)
}
# Kernel-weighted first moment of the distances about the target xi.
s1 <- function(x,xi,h){
dist <- xi - x
sum(K(dist, h) * dist)
}
# Normalized local-linear weights l_i at target xi:
# l_i proportional to K_i * (s2 - (xi - x_i) * s1).
weights <- function(x,xi,h){
dist <- xi - x
raw <- K(dist, h) * (s2(x, xi, h) - dist * s1(x, xi, h))
raw / sum(raw)
}
# Fitted value at one target point xi (a 1x1 matrix, as in the original).
llr_single <- function(y,x,xi,h){
t(weights(x, xi, h)) %*% y
}
# Fitted values over a vector of target points t.
llr_all <- function(y,x,t,h){
sapply(t, function(i) llr_single(y, x, i, h))
}
})
#test with a single point x[1] and h=1
y_pred_1=llr$llr_single(y,x,x[1],1)
y_pred_1
## [,1]
## [1,] 3.969505
#fitted values at every observed x
y_pred=llr$llr_all(y,x,x,1)
head(y_pred,10)
## [1] 3.9695052 5.3426476 4.8696459 2.8793552 2.2915127 1.7903176 0.8970022
## [8] 0.6122573 0.6122573 0.7516287
#choose h by loocv over a grid of candidate bandwidths
h=seq(0.5,10,length.out = 100)
#loocv function: input x, y, bandwidth h; returns the fitted values and the
#shortcut LOOCV error ((y_i - yhat_i)/(1 - H_ii))^2 for each point
llr_loocv <- function(y,x,h){
ypred=llr$llr_all(y,x,x,h)
# Column j holds the weights for target x[j]; the diagonal equals that of
# the hat matrix (diag is invariant under transpose).
H= sapply(x,function(t) llr$weights(x,t,h))
# Vectorized shortcut; the original recomputed diag(H) for every index i.
err <- as.vector(((y - ypred)/(1 - diag(H)))^2)
return(list(ypred=ypred,err=err))
}
# choose the best h by applying loocv over the bandwidth grid
llr_loocv_err=sapply(h,function(t) llr_loocv(y,x,t)$err)
llr_loocv_err_h=colSums(llr_loocv_err)
hbest_llr_loocv=h[which.min(llr_loocv_err_h)]
hbest_llr_loocv
## [1] 6.833333
#plug in the best h to get fitted values at the observed points
y_pred=llr$llr_all(y,x,x,hbest_llr_loocv)
head(y_pred,10)
## [1] 4.9222837 6.0184671 5.1589377 2.9884389 2.4797174 1.3284350 0.9724587
## [8] 0.7261017 0.7261017 1.0871194
#plot prediction
plot(x,y_pred,col='red',pch=8,main='Local Linear Regression of y on x',xlab='x',ylab='y')
#create a new grid for x and its predictions for plotting the fitted line
x_new=seq(5,80,by=0.1)
y_newpred=llr$llr_all(y,x,x_new,hbest_llr_loocv)
lines(x_new,y_newpred,col='red')
points(x,y,pch=20)
legend('topright',c('predicted','true'),lty=1,col=c('red','black'))
#plot residuals
plot(x,y-y_pred,main='Local Linear Regression Residuals',xlab='x',ylab='residual',ylim=c(-3,3))
#homoscedasticity does not hold: residuals appear to shrink as x grows,
#so next we adjust the weights according to the variance
We are going to adjust the weights according to the variance observed in the residuals with respect to x. The new weights: \[w_i\propto K(x_i,x)\cdot\frac{1}{\sigma^2_i}\]
#adjust the weights according to the variance
#observe the (log squared) residuals with respect to x
plot(x,log((y-y_pred)^2),main='log of Residual Square',xlab='x',ylab='log of residual^2')
# model log(residual^2) as a quadratic function of x to get a variance guess
x_qm=cbind(x,x^2)
lm_guess=lm(log((y-y_pred)^2)~x_qm)
lm_guess
##
## Call:
## lm(formula = log((y - y_pred)^2) ~ x_qm)
##
## Coefficients:
## (Intercept) x_qmx x_qm
## -1.479810 0.044177 -0.001261
# extract the fitted coefficients of the variance model.
# NOTE(review): `c` shadows base::c as a variable here; calls c(...) still
# resolve to the base function, but the name is a footgun.
a=lm_guess$coefficients[1]
b=lm_guess$coefficients[2]
c=lm_guess$coefficients[3]
# variance guess: sigma^2(x) = exp(a + b*x + c*x^2) with the coefficients
# fitted above (the original comment's hard-coded numbers were stale)
llr_hetero <- module({
import('stats')
# Gaussian kernel: standard-normal density at the scaled distance d/h.
K <- function(d,h){
dnorm(d / h)
}
# Kernel-weighted second moment of the distances about the target xi.
s2 <- function(x,xi,h){
dist <- xi - x
sum(K(dist, h) * dist^2)
}
# Kernel-weighted first moment of the distances about the target xi.
s1 <- function(x,xi,h){
dist <- xi - x
sum(K(dist, h) * dist)
}
# Local-linear weights, additionally downweighted by the estimated variance
# exp(a + b*x + c*x^2) at each training point (heteroscedastic adjustment).
weights_hetero <- function(x,xi,h,a,b,c){
dist <- xi - x
raw <- K(dist, h) / (exp(a + b*x + c*x^2)) * (s2(x, xi, h) - dist * s1(x, xi, h))
raw / sum(raw)
}
# Fitted value at one target point xi (a 1x1 matrix, as in the original).
llr_single_hetero <- function(y,x,xi,h,a,b,c){
t(weights_hetero(x, xi, h, a, b, c)) %*% y
}
# Fitted values over a vector of target points t.
llr_all_hetero <- function(y,x,t,h,a,b,c){
sapply(t, function(i) llr_single_hetero(y, x, i, h, a, b, c))
}
})
# LOOCV for the heteroscedasticity-adjusted local linear regression:
# input x, y, bandwidth h and variance coefficients a, b, c; returns the
# fitted values and the shortcut LOOCV error per point.
llr_hetero_loocv <- function(y,x,h,a,b,c){
ypred=llr_hetero$llr_all_hetero(y,x,x,h,a,b,c)
# Column j holds the weights for target x[j]; diag matches the hat matrix.
H= sapply(x,function(t) llr_hetero$weights_hetero(x,t,h,a,b,c))
# Vectorized shortcut; the original recomputed diag(H) for every index i.
err <- as.vector(((y - ypred)/(1 - diag(H)))^2)
return(list(ypred=ypred,err=err))
}
# choose the best h by applying loocv with the variance-adjusted weights
llr_loocv_err_hetero=sapply(h,function(t) llr_hetero_loocv(y,x,t,a,b,c)$err)
llr_loocv_err_h_hetero=colSums(llr_loocv_err_hetero)
hbest_llr_loocv_hetero=h[which.min(llr_loocv_err_h_hetero)]
hbest_llr_loocv_hetero
## [1] 4.242424
#plug in the best h
y_pred_hetero=llr_hetero$llr_all_hetero(y,x,x,hbest_llr_loocv_hetero,a,b,c)
head(y_pred_hetero,10)
## [1] 4.8792355 5.9941894 5.0502596 2.8078812 2.2620922 1.1594569 0.8026374
## [8] 0.7095204 0.7095204 0.8953889
#plot prediction
plot(x,y_pred_hetero,col='red',pch=8,main='Local Linear Regression,heteroscadesticity adjusted weight',xlab='x',ylab='y')
#create a new grid for x and its predictions for plotting the fitted line
x_new=seq(5,80,by=0.1)
y_newpred_hetero=llr_hetero$llr_all_hetero(y,x,x_new,hbest_llr_loocv_hetero,a,b,c)
lines(x_new,y_newpred_hetero,col='red')
points(x,y,pch=20)
legend('topright',c('predicted','true'),lty=1,col=c('red','black'))
#plot residuals and their log squares to check the variance adjustment
plot(x,y-y_pred_hetero,main='Local Linear Regression Residuals,,heteroscadesticity adjusted weight',xlab='x',ylab='residual')
plot(x,log((y-y_pred_hetero)^2),main='Local Linear Regression Residuals,,heteroscadesticity adjusted weight',xlab='x',ylab='residual')
The confidence interval for the original fit:
#estimate sigma^2 from the residuals using the smoother's effective df
n=length(y)
#calculate normalized weights l at every observed x; column j holds the
#weights for target x[j], so the hat matrix is the transpose
ls <- sapply(x, function(t) llr$weights(x,t,hbest_llr_loocv))
H=t(ls)
# residual variance estimate: RSS / (n - 2tr(H) + tr(H^T H))
sig2 = sum((y-y_pred)^2)/(n-2*sum(diag(H))+sum(diag(t(H)%*%H)))
l2=sqrt(colSums(ls^2))
#var(f_hat(x)) is sigma^2 * ||l||_2^2
sig2f=l2^2*sig2
#gaussian 95% critical value 1.96
y_ub=y_pred+1.96*sqrt(sig2f)
y_lb=y_pred-1.96*sqrt(sig2f)
#do the same thing for the new x grid we created, to plot the band
ls_new=sapply(x_new, function(t) llr$weights(x,t,hbest_llr_loocv))
l2_new=sqrt(colSums(ls_new^2))
sig2f_new=l2_new^2*sig2
y_ub_new=y_newpred+1.96*sqrt(sig2f_new)
y_lb_new=y_newpred-1.96*sqrt(sig2f_new)
plot(x,y_pred,pch=8,col='red',main='95% CI for Predicted Value')
lines(x_new,y_newpred,col='red')
lines(x_new,y_ub_new,col='blue')
lines(x_new,y_lb_new,col='blue')
points(x,y,pch=20)
# mvtnorm provides rmvnorm for sampling the multivariate-normal GP draws
library(mvtnorm)
N=1000
# input locations for the GP draws (standard normal, so not equally spaced)
x <- rnorm(N,0,1)
m <- rep(0,N)
#GP with mean 0 and squared-exponential covariance; returns n sample paths.
# x: input locations; n: number of draws; b: length scale;
# tau1sq: signal variance; tau2sq: nugget added where inputs coincide.
gp_es <- function(x,n,b,tau1sq, tau2sq){
m <- rep(0,length(x))
# Vectorized covariance via outer() instead of the original O(n^2) double
# loop. The nugget follows the original's x[i]==x[j] convention, so exact
# duplicate inputs also receive tau2sq off the diagonal.
D <- outer(x, x, "-") / b
C <- tau1sq * exp(-0.5 * D^2) + tau2sq * outer(x, x, "==")
fx=rmvnorm(n,mean=m,sigma=C)
return(fx)
}
# GP with mean 0 and Matern-5/2 covariance; same arguments as gp_es.
gp_m52 <- function(x,n,b,tau1sq, tau2sq){
m <- rep(0,length(x))
# Vectorized pairwise distances (replaces the O(n^2) double loop); the
# nugget keeps the original x[i]==x[j] convention.
d <- abs(outer(x, x, "-"))
C <- tau1sq*(1+sqrt(5)*d/b+5*d^2/(3*b^2))*exp(-sqrt(5)*d/b) + tau2sq*outer(x, x, "==")
fx=rmvnorm(n,mean=m,sigma=C)
return(fx)
}
Varying \(b\) in \((0.01,0.1,0.5,1)\)
# draw n=3 sample paths from the squared-exponential GP for each length
# scale b and overlay them (x sorted so the line plots connect properly)
n=3
par(mfrow=c(4,1))
for (b in c(0.01,0.1,0.5,1)){
gpes_fit=gp_es(x,n,b,0.5,10^(-6))
plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square b=",b),type='l',ylim=c(-2,2))
points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}
# the same exercise for the Matern-5/2 covariance
n=3
par(mfrow=c(4,1))
for (b in c(0.01,0.1,0.5,1)){
gpm52_fit=gp_m52(x,n,b,0.5,10^(-6))
plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 b=",b),type='l',ylim=c(-2,2))
points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}
Varying \(\tau_1^2\) in \((0.01,0.1,0.5,1)\)
# draw n=3 sample paths for each signal variance tau1sq (squared exponential)
n=3
par(mfrow=c(4,1))
for (tau1sq in c(0.01,0.1,0.5,1)){
gpes_fit=gp_es(x,n,0.5,tau1sq,10^(-6))
plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square tau1sq=",tau1sq),type='l',ylim=c(-2,2))
points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}
# the same exercise for the Matern-5/2 covariance
n=3
par(mfrow=c(4,1))
for (tau1sq in c(0.01,0.1,0.5,1)){
gpm52_fit=gp_m52(x,n,0.5,tau1sq,10^(-6))
plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 tau1sq=",tau1sq),type='l',ylim=c(-2,2))
points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}
Varying \(\tau_2^2\) in \((10^{-6},0.01,0.1,0.5)\)
# draw n=3 sample paths for each nugget tau2sq (squared exponential)
n=3
par(mfrow=c(4,1))
for (tau2sq in c(10^(-6),0.01,0.1,0.5)){
gpes_fit=gp_es(x,n,0.5,0.5,tau2sq)
# BUG FIX: the title previously printed the stale tau1sq left over from the
# previous loop; this loop varies tau2sq.
plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square tau2sq=",tau2sq),type='l',ylim=c(-2,2))
points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}
# the same exercise for the Matern-5/2 covariance
n=3
par(mfrow=c(4,1))
for (tau2sq in c(10^(-6),0.01,0.1,0.5)){
gpm52_fit=gp_m52(x,n,0.5,0.5,tau2sq)
plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 tau2sq=",tau2sq),type='l',ylim=c(-2,2))
points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}
\[ \left( \begin{array}{c} f(x*) \\ f(x) \end{array} \right) \sim N\left( \left( \begin{array}{c} \mu^* \\ \mu \end{array} \right), \left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x*) & C(x,x) \end{array} \right) \right) \]
The \((i,j)\) entry of \(C(x,x)\) is applying covariance \(C\) function to \(x_i\), \(x_j\), i.e, \(C(x_i,x_j)\).\
For the ease of notation, Call \(f(x*)=f*\), \(f(x)=f\).\
The conditional of \(f^*|f\) is \(f^*|f \sim N(\mu',\Sigma')\) where \[\mu' =\mu^* +C(x^*,x)C(x,x)^{-1}(f-\mu)\] \[\Sigma'=C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*)\]
:\ We know \((f^*,f)\) is jointly normal, with density \[p(f^*,f) \propto exp\left(-\frac{1}{2}[(f^*-\mu^*)^T,(f-\mu)^T]\Sigma^{-1}\left[\begin{array}{cc} f^*-\mu^* \\ f-\mu \end{array}\right]\right)\] where \[\Sigma=\left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x*) & C(x,x) \end{array} \right)\] \[\Sigma^{-1}=\Omega=\left( \begin{array}{cc} \Omega_{11} & \Omega_{12} \\ \Omega_{21} & \Omega_{22} \end{array} \right) =\left[ \begin{array}{cc} (\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1} &-(\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1}\Sigma_{12}\Sigma_{22}^{-1}\\ -\Sigma_{22}^{-1}\Sigma_{12}^T(\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1} &(\Sigma_{22}-\Sigma_{12}^T\Sigma_{11}^{-1}\Sigma_{12})^{-1} \end{array} \right] \]
working on a log scale, \[\log p(f^*,f) \propto -\frac{1}{2}[(f^*-\mu^*)^T,(f-\mu)^T] \left[\begin{array}{cccc} \Omega_{11} &\Omega_{12}\\ \Omega_{12}^T &\Omega_{22} \end{array}\right] \left[\begin{array}{cc} f^*-\mu^* \\ f-\mu \end{array}\right] \]
\[\log p(f^*,f) \propto -\frac{1}{2}\left[(f^*-f)^T\Omega_{11}(f^*-\mu^*)+2(f^*-\mu^*)^T\Omega_{12}(f-\mu)+(f-\mu)^T\Omega_{22}(f-\mu)\right]\]
We know the marginal distribution of \(f\) is \[p(f)=N(\mu,\Sigma_{22})\] Thus \[p(f^*|f)=\frac{p(f^*,f)}{p(f)}\] \[\begin{align} \log(p(f^*|f) \propto -\frac{1}{2}[&(f^*-\mu^*)^T\Omega_{11}(f^*-\mu^*)+2(f^*-\mu^*)^T\Omega_{12}(f-\mu)+(f-\mu)^T\Omega_{22}(f-\mu) \\ &-(f-\mu)^T\Sigma_{22}^{-1}(f-\mu) ]) \end{align}\]Thus, \[f^*-\mu^*|f \sim N(\Omega_{11}^{-1}\Omega_{12}(f-\mu),\Omega_{11}^{-1})\] which means, \[f^*|f \sim N(\mu^*+\Omega_{11}^{-1}\Omega_{12}(f-\mu), \Omega_{11}^{-1})\]
Plug in the expression for \(\Omega\) and \(\Sigma\), \[\Omega_{11}^{-1}\Omega_{12}=\Sigma_{12}\Sigma_{22}^{-1}=C(x^*,x)C(x,x)^{-1}\] \[\Omega_{11}^{-1}=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T=C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*)\]
Put together, \[f^*|f \sim N(\mu^*+C(x^*,x)C(x,x)^{-1}(f-\mu),C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*))\]
Thus we have, \[X\sim MVN(A\mu_1+B\mu_2,A\Sigma_1A^T+B\Sigma_2B^T)\] In this case where \[\left( \begin{array}{c} y \\ \theta \end{array} \right)=A\theta+B\epsilon\] where \[A=\left( \begin{array}{c} R \\ I \end{array} \right), B=\left( \begin{array}{c} I \\ 0 \end{array} \right) \] Thus, \[\left( \begin{array}{c} y \\ \theta \end{array} \right) \sim MVN(Am,AVA^T+B\Sigma B^T)\]
\(cov(y_p,y_q)=C(x_p,x_q)+\sigma^2\delta_{pq}\), thus, \[cov(y)=C(x,x)+\sigma^2 I.\] Then the joint distribution is \[ \left( \begin{array}{c} f(x^*) \\ y \end{array} \right) \sim N\left( \left( \begin{array}{c} 0 \\ 0 \end{array} \right), \left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x^*) & C(x,x)+\sigma^2I \end{array} \right) \right) \] As we already derived in Question 4 Part B, the conditional distribution is \[f^*|y \sim N(C(x^*,x)(C(x,x)+\sigma^2I)^{-1}y,C(x^*,x^*)-C(x^*,x)(C(x,x)+\sigma^2 I)^{-1}C(x,x^*))\] Thus, the posterior expectation can be written in the form of a linear smoother as \[E(f^*|y)=C(x^*,x)(C(x,x)+\sigma^2I)^{-1}y=\sum_i \alpha_i K(x_i,x^*)\] where \[\alpha=(C(x,x)+\sigma^2 I)^{-1}y\]
#read in data for the GP regression on the utilities dataset
rm(list=ls())
set.seed(123)
library(mvtnorm)
library(plot3D)
library(lattice)
# NOTE(review): machine-specific path; adjust before running elsewhere.
setwd("/Users/qiaohui.lin/Desktop/StatModeling2/")
data=read.csv("utilities.csv",header = TRUE,sep=',')
y=data$gasbill/data$billingdays
x=data$temp
# jitter the temperatures so the inputs are distinct; the covariance below
# adds the nugget only on exact ties, so duplicates would make it singular
x=x+rnorm(length(x),0,1)
n=length(y)
# Squared-exponential covariance matrix over the locations x.
# x: numeric vector; b: length scale; tau1sq: signal variance;
# tau2sq: nugget added wherever x[i] == x[j] (so exact duplicate inputs also
# receive the nugget off the diagonal, matching the original loop).
Exp2Sigma <- function(x,b,tau1sq, tau2sq){
  # Vectorized with outer() instead of the original O(n^2) double loop.
  D <- outer(x, x, "-") / b
  C <- tau1sq * exp(-0.5 * D^2) + tau2sq * outer(x, x, "==")
  return(C)
}
#initializing covariance hyperparameters (chosen by hand)
b=20
tau1sq=5
tau2sq=10^(-3)
C=Exp2Sigma(x,b,tau1sq,tau2sq)
# observation noise: homoscedastic, set to the sample variance of y
sigma2=var(y)
Sigma=sigma2*diag(n)
Sigma_inv=(1/sigma2)*diag(n)
#calculate posterior mean and variance and upper/lower bounds at the observed
#x: posterior mean = (Sigma^-1 + C^-1)^-1 Sigma^-1 y
post_mean=solve(Sigma_inv+solve(C))%*% Sigma_inv %*% y
post_var=solve(Sigma_inv+solve(C))
post_lb= post_mean-1.96*sqrt(diag(post_var))
post_ub= post_mean+1.96*sqrt(diag(post_var))
#plot the posterior mean with the 95% pointwise band and the data
plot(x[order(x)],post_mean[order(x)],col='red',ylim=c(0,8),type='l')
points(x[order(x)],post_lb[order(x)],type='l',col='blue')
points(x[order(x)],post_ub[order(x)],type='l',col='blue')
points(x[order(x)],y[order(x)],pch=20)